### This R script is to fill out Table 2

# open necessary packages
require(dplyr)
library(Hmisc)
library(fixest)

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")
data_19 <- read.csv("core_19.csv")

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
data_19 <- data_19 %>% rename(cdis1=cdis)

## import list of teacher variables
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.name
varnames <- varnames[!is.na(varnames)] 

# add simple_avg to varnames
varnames <- c(varnames,"simple_avg")

### 1) Need to calculate weighted average scores and std devs for each teacher variable
## Label the weights you'll use as "weight"
data_19 <- mutate(data_19, weight=teacher_n)

### a) weighted means
## write a function that calculates weighted average
weighted_avg <- function(x,var){ # be sure to type the variable name with "" surrounding it
  if(!is.null(x[[var]])){
    return(weighted.mean(x[[var]],x[["weight"]], na.rm=T))
  } else{return(NA)}}

# put the calculated means in a table
Table_2_means <- matrix(NA, length(varnames), 3)
row.names(Table_2_means) <- varnames
colnames(Table_2_means) <- c("Total", "High_SES", "Low_SES")

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_means[i,"Total"] <- weighted_avg(data_19, var)
}

### b) weighted std devs
### write function to calculate weighted standard deviations
weighted_sd <- function(x,var){ # be sure to type the variable name with "" surrounding it
  if(!is.null(x[[var]])){
    return(sqrt(wtd.var(x[[var]],x[["weight"]])))
  } else{return(NA)}}

# put the calculated sd's in a table
Table_2_std_dev <- matrix(NA, length(varnames), 3)
row.names(Table_2_std_dev) <- varnames
colnames(Table_2_std_dev) <- c("Total", "High_SES", "Low_SES")

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_std_dev[i,"Total"] <- weighted_sd(data_19, var)
}

### 2) Repeat Step 1 with High SES and Low SES groups
## i) separate into high and low SES using median IPR(130)
# find median_ipr_130
median_ipr_130 <- median(data_19$ipr_130, na.rm=T)
# Make a dummy=1 if ipr_130 > median_ipr_130, 0 otherwise.
data_19$above_median_ipr_130 <- ifelse(data_19$ipr_130>=median_ipr_130, 1, 0) 

count(data_19,above_median_ipr_130) # checking to see if the sample is split in half
# at the median

# ii) run analysis on high ses -------------------------------------------------
high_ses <- filter(data_19, above_median_ipr_130==0) # above_median_ipr_130==0 because bigger ipr_130 values mean there
# are more students living at or below 130% of the poverty line

### a) weighted means
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_means[i,"High_SES"] <- weighted_avg(high_ses, var)
}

### b) weighted std devs
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_std_dev[i,"High_SES"] <- weighted_sd(high_ses, var)
}

# iii) run analysis on low ses -------------------------------------------------
low_ses <- filter(data_19, above_median_ipr_130==1)

### a) weighted means
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_means[i,"Low_SES"] <- weighted_avg(low_ses, var)
}

### b) weighted std devs
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_std_dev[i,"Low_SES"] <- weighted_sd(low_ses, var)
}

### 3) Run statistical significance tests for cols 2 and 3, and store p-values. If p-value <= .05, add "*" to the
### corresponding value in the table. Calculate a table of p-values (one for each variable and difference)

# For test on construct c and difference over Low and High_SES,
# a) Stack Low and High_SES data
# b) Run regressions of the following form for each variable:

# Y_ict = B0 + B1*T_t + epsilon_ict
# Where i = school, c = construct (survey variable), t = group (Low or High_SES) 
# T is a group dummy (= 1 if data is from High_SES group)
# run the regressions as weighted (GLS) regressions and cluster by district

# c) store the p-value on B1

## write function to run statistical significance tests:
# data_1 is the low SES value, data_2 is the high ses value
wtd.reg <- function(data_1, data_2, var){ # be sure to type variable name with ""
  if(is.null(data_1[[var]]) | is.null(data_2[[var]])){ # add clause that gives an NA if the variable is not available 
    output <- c(NA,NA); names(output) <- c("coefficient","p-value")}
  else{
    stack_1 <- data.frame(construct=data_1[[var]], weight=data_1[["weight"]],leaid=data_1[["leaid"]])
    stack_2 <- data.frame(construct=data_2[[var]], weight=data_2[["weight"]],leaid=data_2[["leaid"]])
    stack_1$group_dummy <- 0
    stack_2$group_dummy <- 1
    stack <- rbind(stack_1, stack_2)
    
    regression <- feols(construct ~ group_dummy,  cluster=~leaid, data=stack, weights=~weight)
    coeff_B1 <- coef(regression)["group_dummy"]
    p_val_B1 <- pvalue(regression)["group_dummy"]
    output <- c(coeff_B1,p_val_B1); names(output) <- c("coefficient","p-value")}
  return(output)
}
test <- wtd.reg(low_ses, high_ses, "colb")

# The regression worked if the coefficient on B1 equals the difference in means
# from the table. Check this:
row <- which(rownames(Table_2_means)=="colb")
diff_in_means <- Table_2_means[row,"High_SES"]-Table_2_means[row,"Low_SES"]
test[1]-diff_in_means # should be very small amount

### get p-val for each variable 
Table_2_p_vals <- rep(NA, nrow(Table_2_means))
names(Table_2_p_vals) <- rownames(Table_2_means)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_2_p_vals[i] <- wtd.reg(low_ses, high_ses, var)[2]
}
Table_2_p_vals

# check work
Table_2_p_vals>1
Table_2_p_vals<0

# calculate statistical significance at 5% level:
stat_sig <- Table_2_p_vals<=.05

### 4) Combine means, standard deviations and statistical significance into single table:
### For Col 1, want: Mean (St Dev)
### For Col 2/3, want: Mean* (St Dev), with the * added if result is statistically significant

## Remove 'data' variable from means, std dev and stat sig
Table_2_means <- Table_2_means[-which(rownames(Table_2_means)=='data'),]
Table_2_std_dev <- Table_2_std_dev[-which(rownames(Table_2_std_dev)=='data'),]
stat_sig <- stat_sig[-which(names(stat_sig)=='data')]

### Make Table:
Table_2 <- matrix(NA, nrow(Table_2_means), 3)
Table_2 <- as.data.frame(Table_2)
colnames(Table_2) <- colnames(Table_2_means)
rownames(Table_2) <- rownames(Table_2_means)

## Col 1:
# round Table_2_means and Table_2_std_dev
Table_2_means <- round(Table_2_means)
Table_2_std_dev <- round(Table_2_std_dev)
sprintf("%s (%s)", Table_2_means[,"Total"], Table_2_std_dev[,"Total"])
Table_2$Total <- sprintf("%s (%s)", Table_2_means[,"Total"], Table_2_std_dev[,"Total"])

## Col 2 and 3:
# make a dataframe with asterisks where result is statistically significant:
asterisk <- ifelse(stat_sig,"*","")

Table_2$High_SES <- sprintf("%s%s (%s)", Table_2_means[,"High_SES"], asterisk, Table_2_std_dev[,"High_SES"])
Table_2$Low_SES <- sprintf("%s%s (%s)", Table_2_means[,"Low_SES"], asterisk, Table_2_std_dev[,"Low_SES"])

## Export result:
write.csv(Table_2, "Table_2.csv")
